package com.cheng.spider.extract;

import com.cheng.spider.constant.MimeType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base resolver for HTML payloads: parses the incoming text into a jsoup
 * {@link Document} and hands that document to an {@link ExtractRule}.
 *
 * <p>Subclasses provide the rule through {@link #initExtractorRule()} and may
 * override {@link #parseDoc(Document, MimeType)} to customize extraction.
 */
public abstract class HtmlDataResolver extends TextDataResolver {

    private static final Logger LOGGER = LoggerFactory.getLogger(HtmlDataResolver.class);

    /** Rule applied to every parsed document; assigned once in the constructor. */
    protected ExtractRule extractRule;

    /**
     * Creates the resolver and obtains its extraction rule from the subclass.
     *
     * <p>NOTE: {@link #initExtractorRule()} is invoked before the subclass
     * constructor body and subclass field initializers have run, so
     * implementations must not depend on subclass instance state.
     */
    public HtmlDataResolver() {
        extractRule = initExtractorRule();
    }

    /**
     * Parses the given text as HTML and extracts a result from it.
     *
     * @param text     HTML content, already decoded into a string
     * @param mimeType MIME type of the content; forwarded unchanged to
     *                 {@link #parseDoc(Document, MimeType)}
     * @return whatever {@link #parseDoc(Document, MimeType)} produces
     */
    @Override
    protected Result parseText(String text, MimeType mimeType) {
        Document document = convertText2Jsoup(text);
        return parseDoc(document, mimeType);
    }

    /**
     * Extracts a result from a parsed document. The default implementation
     * delegates to {@link #extractRule} and does not use {@code mimeType}.
     *
     * @param document parsed HTML document
     * @param mimeType MIME type of the original content
     * @return the value returned by {@code extractRule.extract(document)}
     */
    protected Result parseDoc(Document document, MimeType mimeType) {
        return extractRule.extract(document);
    }

    /**
     * Parses HTML text into a jsoup document.
     *
     * <p>The text is already a decoded string, so no charset is involved at this
     * point. The two-argument {@code Jsoup.parse(String, String)} overload treats
     * its second argument as the document base URI, not a charset; passing the
     * charset name there would only mislabel the document's base URI, so the
     * single-argument overload is used instead.
     *
     * @param text HTML content
     * @return the parsed document, with an empty base URI
     */
    private Document convertText2Jsoup(String text) {
        return Jsoup.parse(text);
    }

    /**
     * Supplies the extraction rule used by this resolver. Called exactly once,
     * from the {@link HtmlDataResolver} constructor.
     *
     * @return the rule to apply to parsed documents
     */
    protected abstract ExtractRule initExtractorRule();
}
